In [66]:
import pandas as pd
import matplotlib.pyplot as plt
import re
from scipy.sparse import csr_matrix, coo_matrix
import numpy as np

%matplotlib inline

In [67]:
ratings = pd.read_csv('BX-Book-Ratings.csv', encoding='iso-8859-1', sep=';')
ratings.columns = ['user_id', 'isbn', 'book_rating']
books = pd.read_csv('BX-Books.csv', sep=';', encoding='iso-8859-1', dtype=str)

# The same title can appear under several ISBNs (different editions),
# so map every title to a single canonical ISBN.
books["Book-Title"].nunique() == books["ISBN"].nunique()
book_dict = books[["Book-Title", "ISBN"]].set_index("Book-Title").to_dict()["ISBN"]
books['new_isbn'] = books["Book-Title"].apply(lambda x: book_dict[x])
books["Book-Title"].nunique() == books["new_isbn"].nunique()
books['isbn'] = books['new_isbn']

# Drop columns that are not needed downstream.
del books['Image-URL-L']
del books['Image-URL-M']
del books['Image-URL-S']
del books['Book-Author']
del books['Publisher']
del books['ISBN']
del books['new_isbn']

# Keep only explicit ratings (a rating of 0 marks an implicit interaction).
newdf = ratings[ratings.book_rating > 0]
joined = books.merge(newdf, on='isbn')
print(newdf.shape)


(433671, 3)

In [92]:
bookinfo = pd.read_csv("goodreads_list_props.csv")
bookinfo2 = pd.read_csv("goodreads_list_props1.csv")

In [93]:
import pickle
bookinfo3 = pd.read_pickle("ibsn_features_full.pickle")

In [94]:
bookinfo.columns


Out[94]:
Index(['book_name', 'author', 'rating', 'votes', 'description', 'book_type',
       'no_of_pages', 'first_published', 'isbn13', 'genre', 'link'],
      dtype='object')

In [95]:
bookinfo2.columns


Out[95]:
Index(['book_name', 'author', 'rating', ' votes', ' description', 'book_type',
       'no_of_pages', 'first_published', 'isbn13', 'genre', 'link'],
      dtype='object')

In [96]:
bookinfo3.columns


Out[96]:
Index(['isbn', 'description', 'num_pages', 'title'], dtype='object')

In [97]:
# Align bookinfo3's column names with the other Goodreads frames.
bookinfo3.columns = ['isbn13', 'description', 'no_of_pages', 'book_name']

In [98]:
# bookinfo2 has the same fields but with stray leading spaces in some
# headers (' votes', ' description'), so reuse bookinfo's column names.
bookinfo2.columns = bookinfo.columns
bookinfo = pd.concat([bookinfo, bookinfo2])
bookinfo = bookinfo[['isbn13', 'description', 'no_of_pages', 'book_name']]
bookinfo = pd.concat([bookinfo, bookinfo3])
bookinfo.drop_duplicates(inplace=True)

In [99]:
books.drop_duplicates(subset='isbn', inplace=True)

In [18]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys


def is_isbn10_valid(isbn):
    """
    Check ISBN-10 is valid.
    Code implementation from:
    http://en.wikipedia.org/wiki/International_Standard_Book_Number
    """
    if len(isbn) != 10:
        return False
    if ((not isbn[0:9].isdigit()) or
            ((isbn[-1] != 'X') and (not isbn[-1].isdigit()))):
        return False
    result = sum((10 - i) * (int(x) if x != 'X' else 10)
                 for i, x in enumerate(isbn))
    return result % 11 == 0


def is_isbn13_valid(isbn):
    """
    Check ISBN-13 is valid.
    Code implementation from:
    http://en.wikipedia.org/wiki/International_Standard_Book_Number
    """
    if len(isbn) != 13 or not isbn.isdigit():
        return False
    check = (10 - (sum(int(digit) * (3 if idx % 2 else 1)
                       for idx, digit in enumerate(isbn[:12])) % 10)) % 10
    return check == int(isbn[-1])


def isbn13_to_isbn10(isbn13_str):
    """
    Convert ISBN-13 to ISBN-10.
    """
    num = 11 - (sum((10 - i) * int(x)
                    for i, x in enumerate(isbn13_str[3:12])) % 11)
    if num == 10:
        check_digit = 'X'
    elif num == 11:
        check_digit = 0
    else:
        check_digit = num
    return isbn13_str[3:12] + str(check_digit)


def isbn10_to_isbn13(isbn10_str):
    """
    Convert ISBN-10 to ISBN-13.
    """
    check_digit = (
        10 - (sum(int(digit) * (3 if idx % 2 else 1)
                  for idx, digit in enumerate('978' + isbn10_str[:9])
                  ) % 10)) % 10
    return '978' + isbn10_str[:9] + str(check_digit)


def isbn_converter(isbn):
    """
    Convert isbn format to another format.
    """
    if is_isbn10_valid(isbn):
        result = isbn10_to_isbn13(isbn)
    elif is_isbn13_valid(isbn):
        result = isbn13_to_isbn10(isbn)
    else:
        return None
    return result


if __name__ == "__main__":
    for isbn_str in sys.argv[1:]:
        the_result = isbn_converter(isbn_str)
        if the_result:
            print(the_result)
        else:
            print("Bad ISBN " + isbn_str)


Bad ISBN -f
Bad ISBN C:\Users\vijay\AppData\Roaming\jupyter\runtime\kernel-54701759-7a9b-40b3-aed1-ecb74bfa38c3.json
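
A quick sanity check of the converter on a known-valid ISBN-10, 0-306-40615-2 (expected results shown as comments):

isbn_converter('0306406152')     # '9780306406157'
isbn_converter('9780306406157')  # '0306406152'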

In [19]:
# Convert the BX ISBN-10s to ISBN-13 so they can be joined with the Goodreads metadata.
isbn13 = []
for i in books['isbn']:
    isbn13.append(isbn_converter(i))

In [25]:
books['isbn13'] = isbn13

In [26]:
books.dropna(subset=['isbn13'], inplace=True)
bookinfo.dropna(subset=['isbn13'], inplace=True)

In [27]:
mergedinfo = bookinfo.merge(books, on='isbn13', how='inner')

In [28]:
import re
def striphtml(data):
    """Remove HTML tags from a description; return None for missing values."""
    p = re.compile('<.*?>')
    try:
        return p.sub('', data)
    except TypeError:  # non-string (e.g. NaN) description
        return None

In [29]:
mergedinfo['description'] = mergedinfo['description'].apply(striphtml)
mergedinfo['description'] = mergedinfo['description'].str.strip()
mergedinfo['description'] = mergedinfo['description'].str.replace('“', '').str.replace(',', '').str.replace('"', '')
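
Note that only the left curly quote is stripped above; a hedged one-liner that also removes the right curly quote in a single pass (assuming a pandas version whose str.replace accepts regex=True):

mergedinfo['description'] = mergedinfo['description'].str.replace(r'[“”",]', '', regex=True)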

In [30]:
from nltk.corpus import stopwords
# ...
filtereddesc = []
stops = set(stopwords.words("english"))
for desc in mergedinfo['description']:
    try:
        words = desc.split()
        filtereddesc.append([word for word in words if word not in stops])
    except AttributeError:  # missing (NaN) description
        filtereddesc.append(None)

In [31]:
mergedinfo['filtered_description'] = filtereddesc

In [32]:
wordlist = []
for descs in mergedinfo['filtered_description']:
    sentence = []
    if descs is not None:
        for word in descs:
            sentence.append(word)
    wordlist.append(sentence)

Download Google's pre-trained word2vec model (GoogleNews-vectors-negative300.bin) before running the next cell.


In [33]:
import gensim
model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)


C:\Users\vijay\Anaconda2\envs\py35\lib\site-packages\gensim\utils.py:860: UserWarning: detected Windows; aliasing chunkize to chunkize_serial
  warnings.warn("detected Windows; aliasing chunkize to chunkize_serial")
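
If the GoogleNews binary is not already on disk, a hedged alternative (assuming internet access and a gensim release that ships gensim.downloader) is to fetch the same vectors through gensim's downloader:

import gensim.downloader as api
model = api.load('word2vec-google-news-300')  # downloads the vectors on first use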

In [34]:
genres = ['Science', 'Satire', 'Drama', 'Action', 'Romance', 'Mystery', 'Horror', 'Travel',
          'Children', 'Religion', 'History', 'Biography', 'Autobiography', 'Fantasy']

In [35]:
# For every description, score each genre as the mean word2vec similarity
# between the description's words and the genre label (OOV words are skipped).
scores = []
for desc in mergedinfo['filtered_description']:
    if desc is not None:
        gscore = []
        for genre in genres:
            simsum = 0
            n = 0
            for word in desc:
                try:
                    simsum = simsum + model.similarity(word, genre)
                    n = n + 1
                except KeyError:  # word not in the word2vec vocabulary
                    continue
            if n != 0:
                gscore.append(simsum / n)
            else:
                gscore.append(0)
        scores.append(gscore)
    else:
        scores.append(None)
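
To illustrate the shape of each entry in scores, here is the same averaging applied to a hypothetical three-word description (the numbers depend on the loaded model):

toy_desc = ['wizard', 'dragon', 'kingdom']  # hypothetical filtered description
toy_scores = []
for genre in genres:
    sims = []
    for word in toy_desc:
        try:
            sims.append(model.similarity(word, genre))
        except KeyError:  # skip out-of-vocabulary words
            continue
    toy_scores.append(sum(sims) / len(sims) if sims else 0)
# toy_scores is a list of 14 floats, one per entry in genres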

In [36]:
# Books with no usable description get a zero vector (one entry per genre).
editedscores = []
for score in scores:
    if score is not None:
        editedscores.append(score)
    else:
        editedscores.append([0] * 14)

In [37]:
scoredf = pd.DataFrame(editedscores, columns=[genre + '_Score' for genre in genres])

In [38]:
bookfeatures = pd.concat([mergedinfo, scoredf], axis=1)

AMAZON DATASET


In [53]:
newbooks = pd.read_csv("Combine.csv")
newbooksisbn = newbooks['isbn']
newbooksisbn13 = []

for i in newbooksisbn:
    newbooksisbn13.append(isbn_converter(i))

newbooksuniqueisbn13 = list(set(newbooksisbn13))
amazonbookfeatures = bookinfo[bookinfo['isbn13'].isin(newbooksuniqueisbn13)]

In [44]:
amazonbookfeatures['description'] = amazonbookfeatures['description'].apply(striphtml)
amazonbookfeatures['description'] = amazonbookfeatures['description'].str.strip()
amazonbookfeatures['description'] = amazonbookfeatures['description'].str.replace('“', '').str.replace(',', '').str.replace('"', '')


C:\Users\vijay\Anaconda2\envs\py35\lib\site-packages\ipykernel\__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
C:\Users\vijay\Anaconda2\envs\py35\lib\site-packages\ipykernel\__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
C:\Users\vijay\Anaconda2\envs\py35\lib\site-packages\ipykernel\__main__.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
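
The SettingWithCopyWarning above appears because amazonbookfeatures is a slice of bookinfo; a hedged way to avoid it is to take an explicit copy before assigning new columns:

amazonbookfeatures = bookinfo[bookinfo['isbn13'].isin(newbooksuniqueisbn13)].copy()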

In [45]:
filtereddesc = []
stops = set(stopwords.words("english"))
for desc in amazonbookfeatures['description']:
    try:
        words = desc.split()
        filtereddesc.append([word for word in words if word not in stops])
    except AttributeError:  # missing (NaN) description
        filtereddesc.append(None)

In [46]:
amazonbookfeatures['filtered_description'] = filtereddesc
wordlist = []
for descs in amazonbookfeatures['filtered_description']:
    sentence = []
    if descs is not None:
        for word in descs:
            sentence.append(word)
    wordlist.append(sentence)


C:\Users\vijay\Anaconda2\envs\py35\lib\site-packages\ipykernel\__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':

In [47]:
# Same genre scoring as above, applied to the Amazon subset.
scores = []
for desc in amazonbookfeatures['filtered_description']:
    if desc is not None:
        gscore = []
        for genre in genres:
            simsum = 0
            n = 0
            for word in desc:
                try:
                    simsum = simsum + model.similarity(word, genre)
                    n = n + 1
                except KeyError:  # word not in the word2vec vocabulary
                    continue
            if n != 0:
                gscore.append(simsum / n)
            else:
                gscore.append(0)
        scores.append(gscore)
    else:
        scores.append(None)

In [48]:
editedscores = []
for score in scores:
    if score is not None:
        editedscores.append(score)
    else:
        editedscores.append([0] * 14)

In [49]:
scoredf = pd.DataFrame(editedscores, columns=[genre + '_Score' for genre in genres])

In [61]:
amzbookfeatures = pd.concat([amazonbookfeatures.reset_index(drop=True), scoredf], axis=1)